library(tidyverse)
library(knitr)
library(readxl)# use to import .xlsx files
library(ggplot2)
library(ggrepel)
library(GGally)
library(leaflet)
library(sf)
library(RColorBrewer)
library(reshape2)

Load the dataset, covid-regions.csv dataset.

regions<-read_csv("./covid-regions.csv") 

head(regions)
## # A tibble: 6 × 19
##    ...1 date                state region_code region   lat  long hospi…¹ inten…²
##   <dbl> <dttm>              <chr>       <dbl> <chr>  <dbl> <dbl>   <dbl>   <dbl>
## 1     1 2020-05-03 17:00:00 ITA            13 Abruz…  42.4  13.4     300      16
## 2     2 2020-05-03 17:00:00 ITA            17 Basil…  40.6  15.8      48       3
## 3     3 2020-05-03 17:00:00 ITA             4 P.A. …  46.5  11.4     109      11
## 4     4 2020-05-03 17:00:00 ITA            18 Calab…  38.9  16.6      95       4
## 5     5 2020-05-03 17:00:00 ITA            15 Campa…  40.8  14.3     455      30
## 6     6 2020-05-03 17:00:00 ITA             8 Emili…  44.5  11.3    1997     197
## # … with 10 more variables: total_hospitalized <dbl>, home_quarantine <dbl>,
## #   total_confirmed_cases <dbl>, variation_total_confirmed <dbl>,
## #   new_confirmed_cases <dbl>, recovered <dbl>, deaths <dbl>,
## #   total_cases <dbl>, swabs_made <dbl>, casi_testati <dbl>, and abbreviated
## #   variable names ¹​hospitalized_with_symptoms, ²​intensive_care

Problem 1

Each group member should create one meaningful question and answer those questions using any descriptive data analysis methods.(Ex:If your group has 4 members, you should have 4 questions)


Name of the group member who completed this task: Paul Huffman

What is the relation between age, gender, and deaths?

# Read the csv file
covid_data <- read.csv("covid-age.csv")

covid_melt <- melt(covid_data, id.vars = c("age_classes"), 
                   measure.vars = c("male_deaths", "female_deaths"), 
                   variable.name = "gender", 
                   value.name = "deaths")

ggplot(covid_melt, aes(x = age_classes, y = deaths, fill = gender)) + 
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Relation between Gender, Age and Deaths",
       x = "Age Classes", y = "Number of Deaths", fill = "Gender") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5), legend.position = "top") + scale_fill_discrete(labels=c("Male Deaths", "Female Deaths"))

From the above graph it is clear that as age increases so does the number of deaths. It also looks like men are more likely to die from covid as women. The only place where that is not true is in the >90 age group, but I think that is because there are so few people in that age group that it is not a good representation of the population. Also there are more deaths for women than men in that same group, but that is because there are significantly more women who live past 90 than men. Acording to census.gov women over 90 outnumber men over 90 at a rate of about 3 to 1.


Name of the group member who completed this task: Ryan Heiert

Name of the group member who completed this task: Justin Bennet

What are the 5 descriptive statistics of the total deaths from covid?

boxplot(regions$deaths,
main = "Mean Deaths for Covid in Italy",
xlab = "Total Deaths",
ylab = "Death Count",
col = "orange",
border = "brown",
horizontal = TRUE,
notch = TRUE
)
## Warning in (function (z, notch = FALSE, width = NULL, varwidth = FALSE, : some
## notches went outside hinges ('box'): maybe set notch=FALSE


Name of the group member who completed this task: Christian Lane

What is the relationship between total swabs made, and total confirmed cases? Can you visualize that information on a map?

Confirmed Cases as % of Total Swabs Made by Region

# Create two new columns representing the percentage of total confirmed cases out of total cases
# and the percentage of total confirmed cases out of total swabs made
regions = regions %>% 
  mutate(confirmed_percent_of_swabs = total_confirmed_cases / swabs_made * 100)



leaflet(regions) %>% addTiles() %>% addCircles(lng = ~long, lat = ~lat, weight = 1,
                                               radius = ~confirmed_percent_of_swabs * 9000)

The radii of the circles on the map correspond to the percentage of swabs that turned into confirmed cases. We can see that around the cities of Milano and Torino, the percentage of swabs that turned into confirmed cases is relatively higher. In the nortwestern region of Italy in general, a higher percentage of test swabs are turning into confirmed cases than in the other regions of Italy. The northeastern region of Italy seems to have the lowest percentage conversion, though.


Problem 2

Use “Circles” option in leaflet package to visualize the Total number of cases. Label the circles by region (Check Module 10 geospatial data visualization)

n2<-leaflet(regions) %>% addTiles() %>% addCircles(lng = ~long, lat = ~lat, weight = 1,
    radius = ~total_cases, popup = ~region)

n2

Problem 3 (Part1)

Create a parallel plot as given in the project instruction.

Hint: you can use ggplot, GGally, and ggrepel packages

ggparcoord(data=regions,
           columns=8:19,
           groupColumn="region") +
  theme(axis.text.x=element_text(angle=90),axis.text = element_text(size = 8)) +
  geom_point(size=NA)+geom_text_repel(aes(label=`...1`))

Problem 3 (Part2)

Using above plot identify any regions(four regions) which shows deviation from the rest of the regions. Extract the information about those regions. (create a new dataframe, you will use it in next question)

# for some reason it would not allow a collection of length 4, I had to separate like this

newRegions = regions %>% filter(`region` == c("Piemonte", "Lombardia"))
newRegions2 = regions %>% filter(`region` == c( "Veneto", "Emilia-Romagna"))

newRegions = bind_rows(newRegions, newRegions2)

Problem 3 (Part3)

Use appropriate markers in the leaflet package to visualize the above identified regions on a map. Label the markers with the region name.

n3<-leaflet(newRegions) %>% addTiles() %>% addMarkers(~long, ~lat,  label = ~region)

n3

Problem 4(Part1)

Using mutate function in tidyverse create new data column as \(\mbox{death_per=(deaths/total_cases)*100}\) for each region and call it as “death_per”. Add death_per data to the regions dataset. Use mutate function in tidyverse.

regions_death_per<-mutate(regions, death_per=(deaths/total_cases)*100)

summary(regions_death_per)
##       ...1         date                        state            region_code   
##  Min.   : 1   Min.   :2020-05-03 17:00:00   Length:21          Min.   : 1.00  
##  1st Qu.: 6   1st Qu.:2020-05-03 17:00:00   Class :character   1st Qu.: 5.00  
##  Median :11   Median :2020-05-03 17:00:00   Mode  :character   Median :10.00  
##  Mean   :11   Mean   :2020-05-03 17:00:00                      Mean   :10.19  
##  3rd Qu.:16   3rd Qu.:2020-05-03 17:00:00                      3rd Qu.:15.00  
##  Max.   :21   Max.   :2020-05-03 17:00:00                      Max.   :20.00  
##     region               lat             long       hospitalized_with_symptoms
##  Length:21          Min.   :38.12   Min.   : 7.32   Min.   :   8              
##  Class :character   1st Qu.:41.13   1st Qu.:11.12   1st Qu.:  95              
##  Mode  :character   Median :43.62   Median :12.39   Median : 383              
##                     Mean   :43.05   Mean   :12.23   Mean   : 821              
##                     3rd Qu.:45.43   3rd Qu.:13.77   3rd Qu.: 627              
##                     Max.   :46.50   Max.   :16.87   Max.   :6609              
##  intensive_care   total_hospitalized home_quarantine total_confirmed_cases
##  Min.   :  1.00   Min.   :   9.0     Min.   :   33   Min.   :  109        
##  1st Qu.: 10.00   1st Qu.: 102.0     1st Qu.:  587   1st Qu.:  689        
##  Median : 29.00   Median : 412.0     Median : 1791   Median : 2203        
##  Mean   : 71.48   Mean   : 892.5     Mean   : 3878   Mean   : 4770        
##  3rd Qu.: 95.00   3rd Qu.: 695.0     3rd Qu.: 2944   3rd Qu.: 4385        
##  Max.   :532.00   Max.   :7141.0     Max.   :29785   Max.   :36926        
##  variation_total_confirmed new_confirmed_cases   recovered         deaths     
##  Min.   :-278              Min.   :  0.00      Min.   :   98   Min.   :   22  
##  1st Qu.: -41              1st Qu.:  6.00      1st Qu.:  795   1st Qu.:  138  
##  Median : -13              Median : 25.00      Median : 1590   Median :  364  
##  Mean   : -25              Mean   : 66.14      Mean   : 3888   Mean   : 1375  
##  3rd Qu.:   1              3rd Qu.: 53.00      3rd Qu.: 3363   3rd Qu.:  927  
##  Max.   : 259              Max.   :526.00      Max.   :26371   Max.   :14231  
##   total_cases      swabs_made      casi_testati    confirmed_percent_of_swabs
##  Min.   :  301   Min.   :  7075   Min.   :  6046   Min.   :0.4714            
##  1st Qu.: 1394   1st Qu.: 38835   1st Qu.: 24662   1st Qu.:1.8076            
##  Median : 4144   Median : 64412   Median : 42281   Median :2.9057            
##  Mean   :10034   Mean   :102561   Mean   : 69377   Mean   :3.4894            
##  3rd Qu.: 8359   3rd Qu.:150912   3rd Qu.:114354   3rd Qu.:4.5896            
##  Max.   :77528   Max.   :410857   Max.   :247176   Max.   :9.0809            
##    death_per     
##  Min.   : 4.878  
##  1st Qu.: 7.899  
##  Median : 9.668  
##  Mean   :10.152  
##  3rd Qu.:11.491  
##  Max.   :18.356
regions<-mutate(regions, death_per=(deaths/total_cases)*100)

Problem 4 (Part2)

In this problem, your goal is to create a Choropleth map to visualize the Death Percentage for each region. The first steps of the data processing is given. Use the Italy_regions dataframe and complete the code.

Hint: Module 10 geospatial visualization

Italy_regions <- read_sf('shapefile')

name_1=regions$region
regions$name_1<-name_1

 
 # combining data and shapfile 
 is.element(regions$name_1, Italy_regions$name_1) %>%
  all()
## [1] FALSE
 Italy_regions <- merge(Italy_regions, regions, by = 'name_1', all.x = F)
 head(Italy_regions)
## Simple feature collection with 6 features and 35 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 13.01941 ymin: 39.89581 xmax: 16.86736 ymax: 42.89574
## Geodetic CRS:  WGS 84
##       name_1 id_0 iso name_0 id_1 id_2   name_2 hasc_2 ccn_2 cca_2    type_2
## 1    Abruzzo  112 ITA  Italy    1    2 L'Aquila  IT.AQ     0    66 Provincia
## 2    Abruzzo  112 ITA  Italy    1    1   Chieti  IT.CH     0    69 Provincia
## 3    Abruzzo  112 ITA  Italy    1    3  Pescara  IT.PE     0    68 Provincia
## 4    Abruzzo  112 ITA  Italy    1    4   Teramo  IT.TE     0    67 Provincia
## 5 Basilicata  112 ITA  Italy    3   12  Potenza  IT.PZ     0    76 Provincia
## 6 Basilicata  112 ITA  Italy    3   11   Matera  IT.MT     0    77 Provincia
##   engtype_2 nl_name_2 varname_2 ...1                date state region_code
## 1  Province      <NA>    Aquila    1 2020-05-03 17:00:00   ITA          13
## 2  Province      <NA>      <NA>    1 2020-05-03 17:00:00   ITA          13
## 3  Province      <NA>      <NA>    1 2020-05-03 17:00:00   ITA          13
## 4  Province      <NA>      <NA>    1 2020-05-03 17:00:00   ITA          13
## 5  Province      <NA>      <NA>    2 2020-05-03 17:00:00   ITA          17
## 6  Province      <NA>      <NA>    2 2020-05-03 17:00:00   ITA          17
##       region      lat     long hospitalized_with_symptoms intensive_care
## 1    Abruzzo 42.35122 13.39844                        300             16
## 2    Abruzzo 42.35122 13.39844                        300             16
## 3    Abruzzo 42.35122 13.39844                        300             16
## 4    Abruzzo 42.35122 13.39844                        300             16
## 5 Basilicata 40.63947 15.80515                         48              3
## 6 Basilicata 40.63947 15.80515                         48              3
##   total_hospitalized home_quarantine total_confirmed_cases
## 1                316            1552                  1868
## 2                316            1552                  1868
## 3                316            1552                  1868
## 4                316            1552                  1868
## 5                 51             143                   194
## 6                 51             143                   194
##   variation_total_confirmed new_confirmed_cases recovered deaths total_cases
## 1                       -11                  32       798    330        2996
## 2                       -11                  32       798    330        2996
## 3                       -11                  32       798    330        2996
## 4                       -11                  32       798    330        2996
## 5                         3                   6       167     25         386
## 6                         3                   6       167     25         386
##   swabs_made casi_testati confirmed_percent_of_swabs death_per
## 1      40699        29788                   4.589793 11.014686
## 2      40699        29788                   4.589793 11.014686
## 3      40699        29788                   4.589793 11.014686
## 4      40699        29788                   4.589793 11.014686
## 5      14210        14210                   1.365236  6.476684
## 6      14210        14210                   1.365236  6.476684
##                         geometry
## 1 MULTIPOLYGON (((13.40441 42...
## 2 MULTIPOLYGON (((14.25403 42...
## 3 MULTIPOLYGON (((14.07483 42...
## 4 MULTIPOLYGON (((13.91542 42...
## 5 MULTIPOLYGON (((15.71736 39...
## 6 MULTIPOLYGON (((16.7218 40....
paletteNum <- colorNumeric('Blues', domain = Italy_regions$death_per)

leaflet() %>%
  addProviderTiles(providers$CartoDB.PositronNoLabels) %>%
  addPolygons(data = Italy_regions,
              fillColor = ~paletteNum(death_per),
              fillOpacity = 0.7,
              smoothFactor = .3,
              color = "#BDBDC3",
              weight = 1) %>%
  addLegend(pal = paletteNum,
            values = Italy_regions$death_per,
            title = "2020 Percentage  of Deaths<br> by Region in Italy (Covid-19)",
            opacity = 0.7,
            position = 'bottomleft')